{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "31437747-634a-4e8c-8cc8-2846a439d455",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "from swebench import (\n",
    "    MAP_VERSION_TO_INSTALL,\n",
    "    get_logs_eval,\n",
    ")\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec9f433e-54a0-4767-bc43-c327d4231635",
   "metadata": {},
   "source": [
    "## Updating SWE-bench from `conda` sweep results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dbf8c390-5902-450e-be77-81ecca1f1892",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the SWE-bench test split and index its rows by instance ID\n",
    "data = load_dataset(\"princeton-nlp/SWE-bench\", split=\"test\")\n",
    "data_map = {row[\"instance_id\"]: row for row in data}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d72356c3-6b14-416b-90bd-b8652e974472",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Conda installer IDs covered by the validation sweep. Logs for each ID are read from the\n",
    "# sub-folder of `folder` named after the ID with '.' and '-' replaced by '_'.\n",
    "conda_ids = [\n",
    "    'py39_23.10.0-1',\n",
    "    'py39_23.9.0-0',\n",
    "    'py311_23.10.0-1',\n",
    "    'py311_23.9.0-0',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db216774-a7ed-4748-b1b6-e271a951d176",
   "metadata": {},
   "source": [
    "* `folder` points at the location of the `test_202404` folder, which contains the `.log` files generated from running validation sweeps across different conda installation links.\n",
    "* `folder_out` is an arbitrary folder for storing the validation results. The information, particularly how many instances are successfully re-created, is used to inform which conda link to use for which repo/version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ba0214a-3419-44ff-a4ce-ee647ad6ede7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the `test_202404` sweep logs. The original hard-coded path remains the default,\n",
    "# but it can be overridden with the SWEBENCH_VALIDATION_LOGS environment variable so the\n",
    "# notebook can run on another machine without editing this cell.\n",
    "folder = os.environ.get(\n",
    "    \"SWEBENCH_VALIDATION_LOGS\",\n",
    "    \"/Users/johnbyang/Documents/Research/swe-bench/data/validation/test_202404\",\n",
    ")\n",
    "# Output folder for the per-conda-ID result files (relative to the working directory)\n",
    "folder_out = \"logs_per_conda\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7ec779c-1898-4f85-b61e-34d21bf1b6c3",
   "metadata": {},
   "source": [
    "<hr />\n",
    "\n",
    "### Keep instances w/ 1+ F2P intact\n",
    "* Iterate through all conda_ids\n",
    "* Keep a task instance if at least one fail-to-pass (F2P) test is reproduced\n",
    "* If 1+ F2P is reproduced, keep any and all P2P that are reproduced successfully\n",
    "* Save each experiment to a JSON file w/ `conda_id`, `new_dataset`, `ids_reproduce_fail`, `changed_f2p`, and `changed_p2p` fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "537bd507-caa5-4fa0-b9f1-6fe1e86ba455",
   "metadata": {},
   "outputs": [],
   "source": [
    "def survey_updated_logs(log_folder, require_all_f2p=False):\n",
    "    \"\"\"\n",
    "    Survey the `<instance_id>.log` files in `log_folder` for every entry of the global `data`\n",
    "    and rebuild each entry's FAIL_TO_PASS / PASS_TO_PASS lists from the tests marked 'PASSED'.\n",
    "\n",
    "    Args:\n",
    "        log_folder: Folder that holds one `<instance_id>.log` file per task instance.\n",
    "        require_all_f2p: If False (default), an instance counts as reproduced when at least one\n",
    "            of its original F2P tests is 'PASSED' in the log. If True, every original F2P test\n",
    "            must be 'PASSED'.\n",
    "\n",
    "    Returns (in this order):\n",
    "        ids_no_log: List of (instance_id, version) for which no log exists (likely an installation error)\n",
    "        ids_reproduce_fail: List of (instance_id, version) whose log does not meet the F2P criterion\n",
    "            above (with the default, none of the original F2P tests passed)\n",
    "        new_dataset: List of updated dataset entries; FAIL_TO_PASS / PASS_TO_PASS are lists of test names\n",
    "        changed_f2p: List of (instance_id, version) for which the F2P tests have changed\n",
    "        changed_p2p: List of (instance_id, version) for which the P2P tests have changed\n",
    "    \"\"\"\n",
    "    ids_no_log = []\n",
    "    ids_reproduce_fail = []\n",
    "    new_dataset = []\n",
    "    changed_f2p = []\n",
    "    changed_p2p = []\n",
    "\n",
    "    f2p_criterion = all if require_all_f2p else any\n",
    "\n",
    "    for d in tqdm(data):\n",
    "        instance_key = (d['instance_id'], d['version'])\n",
    "        log_path = os.path.join(log_folder, f\"{d['instance_id']}.log\")\n",
    "        if not os.path.exists(log_path):\n",
    "            ids_no_log.append(instance_key)\n",
    "            continue\n",
    "\n",
    "        # Only the test-name -> status map is needed; the second return value is unused here\n",
    "        status_map, _ = get_logs_eval(log_path)\n",
    "        f2p_old = json.loads(d['FAIL_TO_PASS'])\n",
    "        p2p_old = json.loads(d['PASS_TO_PASS'])\n",
    "        # Set copy for constant-time membership tests; the list is kept for the sorted comparison\n",
    "        f2p_old_set = set(f2p_old)\n",
    "\n",
    "        if not f2p_criterion(status_map.get(f2p) == 'PASSED' for f2p in f2p_old):\n",
    "            ids_reproduce_fail.append(instance_key)\n",
    "            continue\n",
    "\n",
    "        # Every passing test is either an original F2P test or is treated as P2P\n",
    "        passed = [name for name, status in status_map.items() if status == 'PASSED']\n",
    "        f2p_new = [name for name in passed if name in f2p_old_set]\n",
    "        p2p_new = [name for name in passed if name not in f2p_old_set]\n",
    "\n",
    "        if sorted(f2p_old) != sorted(f2p_new):\n",
    "            changed_f2p.append(instance_key)\n",
    "        if sorted(p2p_old) != sorted(p2p_new):\n",
    "            changed_p2p.append(instance_key)\n",
    "\n",
    "        new_dataset.append({\n",
    "            **d,\n",
    "            # NOTE: Comment out following line to maintain original tests\n",
    "            'FAIL_TO_PASS': f2p_new,\n",
    "            'PASS_TO_PASS': p2p_new,\n",
    "        })\n",
    "    return ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c311a13e-bdbf-4d56-9fc9-7ceff9e1ceba",
   "metadata": {},
   "source": [
    "<hr />\n",
    "\n",
    "### Identify Best Conda Link\n",
    "\n",
    "Loop to determine how many instances were reproduced per conda ID. The results for each conda ID are stored to a `.json` file in the `folder_out` directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0ba91f42-b6ca-4a5e-82dc-14a57249dbaa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "py39_23_10_0_1\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f3e98c3f7b454e6388919f99f58c5a20",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2294 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "- # Reproduce Fail: 4\n",
      "- # New Dataset: 2290\n",
      "- # Changed (F2P): 32\n",
      "- # Changed (P2P): 242\n",
      "py39_23_9_0_0\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f8d1f40285f440ab8f0c8eae36104b3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2294 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "- # Reproduce Fail: 10\n",
      "- # New Dataset: 230\n",
      "- # Changed (F2P): 7\n",
      "- # Changed (P2P): 72\n",
      "py311_23_10_0_1\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c0fbfc45ba96499ab9f8544071f7499a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2294 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "- # Reproduce Fail: 10\n",
      "- # New Dataset: 232\n",
      "- # Changed (F2P): 5\n",
      "- # Changed (P2P): 73\n",
      "py311_23_9_0_0\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f82b49fed8ef45da93b98b1f30f852a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2294 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "- # Reproduce Fail: 10\n",
      "- # New Dataset: 229\n",
      "- # Changed (F2P): 4\n",
      "- # Changed (P2P): 67\n"
     ]
    }
   ],
   "source": [
    "# Survey the logs of every conda ID and write one results file per ID into `folder_out`\n",
    "os.makedirs(folder_out, exist_ok=True)  # the open(..., \"w\") below fails if the folder is missing\n",
    "map_folder_id_to_conda_id = {}\n",
    "for conda_id in conda_ids:\n",
    "    # Log folders and result files are named after the conda ID with '.' and '-' replaced by '_'\n",
    "    folder_id = conda_id.replace('.', '_').replace('-', '_')\n",
    "    map_folder_id_to_conda_id[folder_id] = conda_id\n",
    "    print(folder_id)\n",
    "\n",
    "    log_folder = f\"{folder}/{folder_id}\"\n",
    "    ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = survey_updated_logs(log_folder)\n",
    "    if not (ids_no_log or ids_reproduce_fail or new_dataset):\n",
    "        # Nothing was surveyed at all (only possible when `data` is empty); skip the report\n",
    "        continue\n",
    "\n",
    "    print(f\"- # Reproduce Fail: {len(ids_reproduce_fail)}\")\n",
    "    print(f\"- # New Dataset: {len(new_dataset)}\")\n",
    "    print(f\"- # Changed (F2P): {len(changed_f2p)}\")\n",
    "    print(f\"- # Changed (P2P): {len(changed_p2p)}\")\n",
    "    # The stored `conda_id` is the folder-style ID, matching the file name\n",
    "    results = {\n",
    "        \"conda_id\": folder_id,\n",
    "        \"new_dataset\": new_dataset,\n",
    "        \"ids_reproduce_fail\": ids_reproduce_fail,\n",
    "        \"changed_f2p\": changed_f2p,\n",
    "        \"changed_p2p\": changed_p2p,\n",
    "    }\n",
    "    with open(f\"{folder_out}/results_{folder_id}.json\", \"w\") as f:\n",
    "        json.dump(results, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afb3718a-a9fc-4fd5-8232-b280432bc134",
   "metadata": {},
   "source": [
    "<hr />"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31a3dc25-493c-4f08-8add-0a083dd095d5",
   "metadata": {},
   "source": [
    "#### Determine which Miniconda installer URL successfully captures the most instances for each repo/version pair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "141477a3-467b-41da-91d0-9894688f91ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def determine_which_conda_id_is_best(repo, version):\n",
    "    \"\"\"\n",
    "    Given a repo and version, determine which conda_id recreates the most instances based on the number of instances\n",
    "    from that repo/version combo in new_dataset.\n",
    "\n",
    "    Reads `{folder_out}/results_<id>.json` for every entry of the global `conda_ids`, where <id> is the\n",
    "    conda ID with '-' and '.' replaced by '_'.\n",
    "\n",
    "    Returns:\n",
    "        (max_count, conda_id_best): the highest instance count and the folder-style ID ('_' separators)\n",
    "        that achieved it. Ties go to the earliest entry of `conda_ids`; conda_id_best is None (and\n",
    "        max_count is 0) when no conda ID recreated any instance of this repo/version.\n",
    "    \"\"\"\n",
    "    max_count, conda_id_best = 0, None\n",
    "    for raw_id in conda_ids:\n",
    "        # Derive the folder-style ID directly rather than parsing it back out of the path,\n",
    "        # which would break if `folder_out` contained a '/'\n",
    "        conda_id = raw_id.replace('-', '_').replace('.', '_')\n",
    "        results_path = f\"{folder_out}/results_{conda_id}.json\"\n",
    "        with open(results_path) as f:\n",
    "            data_adjusted = json.load(f)\n",
    "        count = sum(\n",
    "            1 for inst in data_adjusted['new_dataset']\n",
    "            if inst['repo'] == repo and inst['version'] == version\n",
    "        )\n",
    "        if count > max_count:\n",
    "            max_count = count\n",
    "            conda_id_best = conda_id\n",
    "    return max_count, conda_id_best"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ee89dfd3-86c1-41cd-b13d-8a314be9d229",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "astropy/astropy 1.3 py39_23.10.0-1 11\n",
      "astropy/astropy 3.0 py39_23.10.0-1 6\n",
      "astropy/astropy 3.1 py39_23.10.0-1 5\n",
      "astropy/astropy 4.2 py39_23.10.0-1 1\n",
      "astropy/astropy 4.3 py39_23.10.0-1 11\n",
      "astropy/astropy 5.0 py39_23.10.0-1 29\n",
      "astropy/astropy 5.1 py39_23.10.0-1 23\n",
      "astropy/astropy 5.2 py39_23.10.0-1 9\n",
      "django/django 1.9 py39_23.10.0-1 1\n",
      "django/django 1.10 py39_23.10.0-1 1\n",
      "django/django 1.11 py39_23.10.0-1 4\n",
      "django/django 2.0 py39_23.10.0-1 2\n",
      "django/django 2.1 py39_23.10.0-1 2\n",
      "django/django 2.2 py39_23.10.0-1 6\n",
      "django/django 3.0 py39_23.10.0-1 137\n",
      "django/django 3.1 py39_23.10.0-1 128\n",
      "django/django 3.2 py39_23.10.0-1 157\n",
      "django/django 4.0 py39_23.10.0-1 123\n",
      "django/django 4.1 py39_23.10.0-1 124\n",
      "django/django 4.2 py39_23.10.0-1 84\n",
      "django/django 5.0 py39_23.10.0-1 81\n",
      "matplotlib/matplotlib 3.5 py39_23.10.0-1 40\n",
      "matplotlib/matplotlib 3.6 py39_23.10.0-1 40\n",
      "matplotlib/matplotlib 3.7 py39_23.10.0-1 61\n",
      "matplotlib/matplotlib 3.1 py39_23.10.0-1 2\n",
      "matplotlib/matplotlib 3.2 py39_23.10.0-1 1\n",
      "matplotlib/matplotlib 3.3 py39_23.10.0-1 4\n",
      "matplotlib/matplotlib 3.4 py39_23.10.0-1 26\n",
      "matplotlib/matplotlib 3.0 py39_23.10.0-1 9\n",
      "mwaskom/seaborn 0.11 py39_23.10.0-1 1\n",
      "mwaskom/seaborn 0.12 py39_23.10.0-1 16\n",
      "mwaskom/seaborn 0.13 py39_23.10.0-1 5\n",
      "pallets/flask 2.0 py39_23.10.0-1 4\n",
      "pallets/flask 2.1 py39_23.10.0-1 1\n",
      "pallets/flask 2.2 py39_23.10.0-1 2\n",
      "pallets/flask 2.3 py39_23.10.0-1 4\n",
      "psf/requests 0.13 py39_23.10.0-1 1\n",
      "psf/requests 0.14 py39_23.10.0-1 1\n",
      "psf/requests 1.1 py39_23.10.0-1 1\n",
      "psf/requests 1.2 py39_23.10.0-1 4\n",
      "psf/requests 2.0 py39_23.10.0-1 9\n",
      "psf/requests 2.2 py39_23.10.0-1 2\n",
      "psf/requests 2.3 py39_23.10.0-1 5\n",
      "psf/requests 2.4 py39_23.9.0-0 2\n",
      "psf/requests 2.5 py39_23.10.0-1 3\n",
      "psf/requests 2.7 py39_23.10.0-1 3\n",
      "psf/requests 2.8 py39_23.10.0-1 2\n",
      "psf/requests 2.9 py39_23.10.0-1 1\n",
      "psf/requests 2.10 py39_23.10.0-1 1\n",
      "psf/requests 2.12 py39_23.10.0-1 2\n",
      "psf/requests 2.17 py39_23.10.0-1 1\n",
      "psf/requests 2.18 py39_23.10.0-1 1\n",
      "psf/requests 2.19 py39_23.10.0-1 1\n",
      "psf/requests 2.22 py39_23.10.0-1 1\n",
      "psf/requests 2.26 py39_23.10.0-1 1\n",
      "psf/requests 2.27 py39_23.10.0-1 1\n",
      "psf/requests 3.0 py39_23.10.0-1 1\n",
      "pydata/xarray 0.12 py39_23.10.0-1 63\n",
      "pydata/xarray 0.18 py39_23.10.0-1 4\n",
      "pydata/xarray 0.19 py39_23.10.0-1 3\n",
      "pydata/xarray 0.20 py39_23.10.0-1 1\n",
      "pydata/xarray 2022.03 py39_23.10.0-1 8\n",
      "pydata/xarray 2022.06 py39_23.10.0-1 18\n",
      "pydata/xarray 2022.09 py39_23.10.0-1 13\n",
      "pylint-dev/pylint 2.10 py39_23.10.0-1 5\n",
      "pylint-dev/pylint 2.11 py39_23.10.0-1 4\n",
      "pylint-dev/pylint 2.13 py39_23.10.0-1 8\n",
      "pylint-dev/pylint 2.14 py39_23.10.0-1 11\n",
      "pylint-dev/pylint 2.15 py39_23.10.0-1 9\n",
      "pylint-dev/pylint 2.16 py39_23.10.0-1 1\n",
      "pylint-dev/pylint 2.17 py39_23.10.0-1 3\n",
      "pylint-dev/pylint 2.8 py39_23.10.0-1 6\n",
      "pylint-dev/pylint 2.9 py39_23.10.0-1 4\n",
      "pylint-dev/pylint 3.0 py39_23.10.0-1 6\n",
      "pytest-dev/pytest 4.4 py39_23.10.0-1 3\n",
      "pytest-dev/pytest 4.5 py39_23.10.0-1 4\n",
      "pytest-dev/pytest 4.6 py39_23.10.0-1 8\n",
      "pytest-dev/pytest 5.0 py39_23.10.0-1 5\n",
      "pytest-dev/pytest 5.1 py39_23.10.0-1 3\n",
      "pytest-dev/pytest 5.2 py39_23.10.0-1 6\n",
      "pytest-dev/pytest 5.3 py39_23.10.0-1 2\n",
      "pytest-dev/pytest 5.4 py39_23.10.0-1 19\n",
      "pytest-dev/pytest 6.0 py39_23.10.0-1 11\n",
      "pytest-dev/pytest 6.2 py39_23.10.0-1 7\n",
      "pytest-dev/pytest 6.3 py39_23.10.0-1 8\n",
      "pytest-dev/pytest 7.0 py39_23.10.0-1 11\n",
      "pytest-dev/pytest 7.1 py39_23.10.0-1 5\n",
      "pytest-dev/pytest 7.2 py39_23.10.0-1 15\n",
      "pytest-dev/pytest 7.4 py39_23.10.0-1 6\n",
      "pytest-dev/pytest 8.0 py39_23.10.0-1 6\n",
      "scikit-learn/scikit-learn 0.20 py39_23.10.0-1 60\n",
      "scikit-learn/scikit-learn 0.21 py39_23.10.0-1 60\n",
      "scikit-learn/scikit-learn 0.22 py39_23.10.0-1 72\n",
      "scikit-learn/scikit-learn 1.3 py39_23.10.0-1 35\n",
      "scikit-learn/scikit-learn 1.4 py39_23.10.0-1 2\n",
      "sphinx-doc/sphinx 3.0 py39_23.10.0-1 12\n",
      "sphinx-doc/sphinx 3.1 py39_23.10.0-1 17\n",
      "sphinx-doc/sphinx 3.2 py39_23.10.0-1 15\n",
      "sphinx-doc/sphinx 3.3 py39_23.10.0-1 11\n",
      "sphinx-doc/sphinx 3.4 py39_23.10.0-1 15\n",
      "sphinx-doc/sphinx 3.5 py39_23.10.0-1 18\n",
      "sphinx-doc/sphinx 4.0 py39_23.10.0-1 16\n",
      "sphinx-doc/sphinx 4.1 py39_23.10.0-1 19\n",
      "sphinx-doc/sphinx 4.2 py39_23.10.0-1 5\n",
      "sphinx-doc/sphinx 4.3 py39_23.10.0-1 12\n",
      "sphinx-doc/sphinx 4.4 py39_23.10.0-1 7\n",
      "sphinx-doc/sphinx 5.0 py39_23.10.0-1 16\n",
      "sphinx-doc/sphinx 5.1 py39_23.10.0-1 5\n",
      "sphinx-doc/sphinx 5.2 py39_23.10.0-1 4\n",
      "sphinx-doc/sphinx 5.3 py39_23.10.0-1 1\n",
      "sphinx-doc/sphinx 6.2 py39_23.10.0-1 3\n",
      "sphinx-doc/sphinx 7.1 py39_23.10.0-1 5\n",
      "sphinx-doc/sphinx 7.2 py39_23.10.0-1 6\n",
      "sympy/sympy 1.0 py39_23.10.0-1 36\n",
      "sympy/sympy 1.1 py39_23.10.0-1 88\n",
      "sympy/sympy 1.10 py39_23.10.0-1 11\n",
      "sympy/sympy 1.11 py39_23.10.0-1 13\n",
      "sympy/sympy 1.12 py39_23.10.0-1 17\n",
      "sympy/sympy 1.2 py39_23.10.0-1 6\n",
      "sympy/sympy 1.4 py39_23.10.0-1 43\n",
      "sympy/sympy 1.5 py39_23.10.0-1 60\n",
      "sympy/sympy 1.6 py39_23.10.0-1 48\n",
      "sympy/sympy 1.7 py39_23.10.0-1 18\n",
      "sympy/sympy 1.8 py39_23.10.0-1 17\n",
      "sympy/sympy 1.9 py39_23.10.0-1 26\n",
      "sympy/sympy 1.13 py39_23.10.0-1 1\n"
     ]
    }
   ],
   "source": [
    "repo_version_to_conda_id = {}\n",
    "total = 0\n",
    "test_set_repo_version_pairs = {(x['repo'], x['version']) for x in data_map.values()}\n",
    "# Built once up front instead of on every loop iteration\n",
    "test_set_repos = set(data['repo'])\n",
    "\n",
    "# Loop through all repo/version combos\n",
    "for repo, v in MAP_VERSION_TO_INSTALL.items():\n",
    "    if repo not in test_set_repos:\n",
    "        # Do not proceed for repos that are not in test set\n",
    "        continue\n",
    "\n",
    "    repo_version_to_conda_id[repo] = {}\n",
    "    for version in v:\n",
    "        if (repo, version) not in test_set_repo_version_pairs:\n",
    "            # Do not proceed for (repo, version) pairs that are not in test set\n",
    "            continue\n",
    "\n",
    "        # Determine which conda_id is best for this repo/version combo\n",
    "        max_count, conda_id_best = determine_which_conda_id_is_best(repo, version)\n",
    "        if conda_id_best is not None:\n",
    "            # Translate the folder-style ID back to the original conda ID\n",
    "            conda_id_best = map_folder_id_to_conda_id[conda_id_best]\n",
    "\n",
    "        # Bookkeeping (log # of recreated instances and store the best conda_id)\n",
    "        total += max_count\n",
    "        repo_version_to_conda_id[repo][version] = conda_id_best\n",
    "        print(repo, version, conda_id_best, max_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b53ab26-5481-488d-80e1-9f99c1c0bced",
   "metadata": {},
   "source": [
    "Create dataset from the above repo/version-to-conda-ID map. Use the conda ID that recreates the most task instances for each repo/version pair."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "aecf61ca-890d-4bed-920a-b15a4f94e0fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c7e4ff85d13b45e5a9ab3bc63b999826",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/12 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2291\n"
     ]
    }
   ],
   "source": [
    "new_dataset_agg = []\n",
    "# folder-style conda ID -> its `new_dataset` list, so each results file is read only once\n",
    "loaded_results = {}\n",
    "for repo, version_map in tqdm(repo_version_to_conda_id.items()):\n",
    "    for version, conda_id in version_map.items():\n",
    "        if conda_id is None:\n",
    "            # No conda ID recreated any instance of this repo/version\n",
    "            continue\n",
    "        conda_id = conda_id.replace('.', '_').replace('-', '_')\n",
    "        if conda_id not in loaded_results:\n",
    "            with open(f\"{folder_out}/results_{conda_id}.json\") as f:\n",
    "                loaded_results[conda_id] = json.load(f)['new_dataset']\n",
    "        new_dataset_agg.extend(\n",
    "            x for x in loaded_results[conda_id]\n",
    "            if x['repo'] == repo and x['version'] == version\n",
    "        )\n",
    "\n",
    "new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x['instance_id'])\n",
    "print(len(new_dataset_agg))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa5cbf98-8497-4176-916b-c67071b06111",
   "metadata": {},
   "source": [
    "<hr />"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e859784-a760-4891-9e71-860513abdd12",
   "metadata": {},
   "source": [
    "#### What task instances are not resolved?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4f05f1c8-ca88-4cde-b1e6-b1b96f4dcc26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set subtraction: (instance_id, version) pairs of the original test set missing from the new dataset\n",
    "original_keys = {(x['instance_id'], x['version']) for x in data}\n",
    "recreated_keys = {(x['instance_id'], x['version']) for x in new_dataset_agg}\n",
    "failing_inst_ids = sorted(original_keys - recreated_keys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8cb14cd7-fa6a-4751-ac3e-720c9134f293",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('matplotlib__matplotlib-26399', '3.7'),\n",
       " ('sympy__sympy-11818', '1.0'),\n",
       " ('sympy__sympy-13865', '1.1')]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show failing instance IDs\n",
    "failing_inst_ids"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
